
//-----------------------------------------------------------------------
//
// Gentrification and pioneer businesses
// Kristian Behrens, Brahim Boualam, Julien Martin, and Florian Mayneris
//
// First version: 17/02/2015
// This  version: 17/11/2021
//
// Preparation of the NHGIS census variables for New York
//
//-----------------------------------------------------------------------

// Do-file for concording geographical data, and for apportioning data
// using Census Blocks (either with population weights or proportional to the
// surface of the blocks)


//THIS IS FOR THE INCOME VARIABLES, BROKEN DOWN FROM BLOCK GROUP TO BLOCK


cd "/Users/kristianbehrens/Desktop/GENTRI_RESTAT_FINAL/"


clear
clear matrix

set more off


// This is the list of counties for the NY MSA (we use the 2010 definition)
#delimit ; 

local listcountyNYMSA
34003	36005	36027	34013	34017	34019	36047	34023	34025	34027
36059	36061	34029	36071	34031	42103	36079	36081	36085	36087
34035	36103	34037	34039	36119;

local listvar
001 002 003 004 005;

local years
90 00 10;

#delimit cr 



// ---------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 1990 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_income_bgroup/nhgis0012_ds123_1990_blck_grp.csv", delimit(",")

gen temp1 = ""
gen temp2 = ""
gen temp3 = ""
gen temp4 = ""
gen geoid90 = ""

gen test = substr(gisjoin, 2, 100)

if (strlen(gisjoin) == 13) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 4)
	replace temp4 = substr(test, 12, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

if (strlen(gisjoin) == 15) {
	replace temp1 = substr(test, 1, 2)
	replace temp2 = substr(test, 4, 3)
	replace temp3 = substr(test, 8, 6)
	replace temp4 = substr(test, 14, 1)
	replace geoid90 = temp1 + temp2 + temp3 + temp4
}

drop temp* test

sort geoid90
drop if geoid90[_n-1] == geoid90[_n]


keep geoid90 e4u001 e01001
ren e4u001 median_income90
ren e01001 percap_income90

// Keep only NYMSA counties
gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid90 median_income90 percap_income90
drop ctid temp
save "temp/temp_variables.dta", replace


// STEP 2

// Merge with the 1990 geography files
use "data/geography/geog_new_york/geography1990.dta", clear


// Required for 1990 data, takes care of the block suffix
gen full_bg_id = ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 2) if blocksuffix90 == ""
replace full_bg_id = substr(geoid90, 1, strlen(geoid90) - 3) if blocksuffix90 != ""

gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid90 geoid90_back
ren full_bg_id geoid90

sort geoid90

merge m:1 geoid90 using "temp/temp_variables.dta"

tab _merge
drop if _merge == 2
drop _merge

ren geoid90 geoid90_blgroup
ren geoid90_back geoid90


gen year = 1990
keep year geoid90 median_income90 percap_income90

ren geoid90 geoid
ren median_income90 median_income_block
ren percap_income90 percap_income_block

order geoid year median_income_block percap_income_block

save "temp/income1990.dta", replace

// END: MATCHING OF NHGIS 1990 BLOCKGROUP DATA
// ---------------------------------------------------------



clear



// ---------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2000 BLOCKGROUP DATA


// STEP 1: GET THE BLOCKGROUP DATA

insheet using "data/census/nhgis_income_bgroup/nhgis0012_ds152_2000_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid00 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid00 hf6001 hg4001
ren hf6001 median_income00 
ren hg4001 percap_income00

//Keep only NYMSA counties
gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid00 median_income00 percap_income00
drop ctid temp
save "temp/temp_variables.dta", replace


//STEP 2: MERGE WITH THE BLOCK GEOGRAPHY FILES

use "data/geography/geog_new_york/geography2000.dta", clear

gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid00 geoid00_back
gen geoid00 = substr(geoid00_back, 1, 12)

sort geoid00

merge m:1 geoid00 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid00 geoid00_blgroup
ren geoid00_back geoid00


gen year = 2000
keep year geoid00 median_income00 percap_income00

ren geoid00 geoid
ren median_income00 median_income_block
ren percap_income00 percap_income_block

order geoid year median_income_block percap_income_block

save "temp/income2000.dta", replace


// END: MATCHING OF NHGIS 2000 BLOCKGROUP DATA
// ---------------------------------------------------------



clear



// ---------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2010 BLOCKGROUP DATA

// STEP 1

insheet using "data/census/nhgis_income_bgroup/nhgis0012_ds176_20105_2010_blck_grp.csv", delimit(",")

gen test = substr(gisjoin, 2, 100)
gen temp1 = substr(test, 1, 2)
gen temp2 = substr(test, 4, 3)
gen temp3 = substr(test, 8, 6)
gen temp4 = substr(test, 14, 1)
gen geoid10 = temp1 + temp2 + temp3 + temp4


drop test temp*
keep geoid10 joie001 jqbe001
ren joie001 median_income10 
ren jqbe001 percap_income10

//Keep only NYMSA counties
gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid10 median_income10 percap_income10
drop ctid temp
save "temp/temp_variables.dta", replace


//STEP 2

//Merge with the 2010 geography files
use "data/geography/geog_new_york/geography2010.dta", clear

gen ctid = substr(geoid10, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1
drop temp ctid

ren geoid10 geoid10_back
gen geoid10 = substr(geoid10_back, 1, 12)

sort geoid10

merge m:1 geoid10 using "temp/temp_variables.dta"

tab _merge
drop _merge

ren geoid10 geoid10_blgroup
ren geoid10_back geoid10


gen year = 2010
keep year geoid10 median_income10 percap_income10

ren geoid10 geoid
ren median_income10 median_income_block
ren percap_income10 percap_income_block

order geoid year median_income_block percap_income_block

save "temp/income2010.dta", replace


// END: MATCHING OF NHGIS 2010 BLOCKGROUP DATA
// -----------------------------------------------------


// Append the files 

append using "temp/income2000.dta" "temp/income1990.dta"

sort year geoid

save "results/NewYork/census_variables_income.dta", replace


// Adjust the geoid codes for 1990
// The 1990 block data comes from the NHGIS, with NHGIS identifiers
// The concordance data comes from the Census with the Census identifiers

// We need to transform the NHGIS identifiers (in the geography1990.dta
// files to Census identifiers)

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 3) if strlen(geoid) == 12
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 12

drop test*

gen test1 = substr(geoid, 1, 9)
gen test2 = substr(geoid, 10, 4) if strlen(geoid) == 13
gen test3 = test1 + "00" + test2
replace  geoid = test3 if strlen(geoid) == 13

drop test*

save "results/NewYork/census_variables_income.dta", replace

